In [ ]:
import spynner
import os, sys
from IPython.display import display, Image
In [ ]:
# Open a browser using spynner
browser = spynner.Browser(debug_level=spynner.ERROR, debug_stream=sys.stderr)
browser.show() # tell the browser to stay visible from now on instead of running hidden
# Fill in the appropriate site address in place of ??????
base_url = 'http://v.??????.com/online/comic-7340.html?ch='
# Chapter number; it is appended to the `ch=` query parameter of base_url
book_no = 54
# Load the reader page for that chapter
browser.load( base_url+str(book_no))
找出圖片的網址
In [ ]:
browser.load_jquery(True) # spynner ships with jQuery; loading it through this method is more convenient
# Read the `src` attribute of the element whose id is "TheImg"
img_url = browser.runjs('$("#TheImg").attr("src")')
print(img_url)
# Last expression of the cell: display the image behind that URL, 200 px wide
Image(img_url, width=200)
In [ ]:
# Of course the same lookup also works without jQuery
browser.runjs('document.getElementById("TheImg").getAttribute("src")')
有了網址, 可以用 urllib 抓下來。 也可以用 spynner 來抓。
In [ ]:
# Fetch the image through the browser itself; the result is the JPEG file content
img_data = browser.download(img_url)

# Keep a copy on disk as the cover image
with open("cover.jpg", 'wb') as cover_file:
    cover_file.write(img_data)

# Last expression of the cell: preview the downloaded bytes, 200 px wide
Image(data=img_data, width=200)
如何換頁?
In [ ]:
# Total number of pages
# `ps` is evaluated as JavaScript inside the loaded page; presumably it is a
# page-defined global holding the page count -- confirm against the site.
total_pages = int(browser.runjs('ps'))
# Last expression of the cell, so the notebook shows the value
total_pages
開始用迴圈來抓圖吧
In [ ]:
# Visit every page of the chapter and save its image into the current directory.
for page in range(1, total_pages + 1):
    # Each page is addressed as <base_url><book_no>-<page>
    page_url = "{}{}-{}".format(base_url, book_no, page)
    browser.load(page_url)
    img_url = browser.runjs('document.getElementById("TheImg").getAttribute("src")')
    print(page, img_url)

    # Files are named <book_no>-<page>.jpg
    jpg_name = "{}-{}.jpg".format(book_no, page)
    with open(jpg_name, "wb") as f:
        # The open file is handed to spynner as the download's output
        browser.download(img_url, outfd=f)

print("File saved in", os.getcwd())
In [ ]:
from PySide.QtWebKit import QWebSettings # 用來設定 QtWebKit
# Use ipywidgets to build the interface (image preview and progress bar)
import ipywidgets as W
In [ ]:
# Configure the browser not to load images automatically
settings = browser.webview.settings()
settings.setAttribute(QWebSettings.AutoLoadImages, False)
In [ ]:
# With automatic image loading switched off, the browser can be seen not to
# fetch the pictures, while the image URL can still be read from each page.
total_pages = int(browser.runjs('ps'))

# Preview at most the first 10 pages, but never more pages than the chapter
# actually has (a fixed bound of 10 would request non-existent pages for
# shorter chapters).
preview_pages = min(10, total_pages)
for page in range(1, preview_pages + 1):
    browser.load("{}{}-{}".format(base_url, book_no, page))
    img_url = browser.runjs('document.getElementById("TheImg").getAttribute("src")')
    print(page, img_url)
現在來建立一個介面
In [ ]:
# Image widget used to preview the most recently downloaded page
img = W.Image()
img.width = 300
# Progress bar showing the download progress (one step per page)
progress = W.IntProgress(min=1, value=1, max=total_pages)
display(img)
display(progress)

# Pages are stored under download/<two-digit chapter number>/
dir_name = os.path.join("download", "{:02d}".format(book_no))
# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step that could race with another process.
os.makedirs(dir_name, exist_ok=True)
print("Download to {}".format(os.path.join(os.getcwd(), dir_name)))
# Flush so the message appears before the download loop starts
sys.stdout.flush()

# Start downloading
for page in range(1, total_pages+1):
    # Get the image URL of this page
    browser.load("{}{}-{}".format(base_url, book_no, page))
    img_url = browser.runjs('document.getElementById("TheImg").getAttribute("src")')
    # Download the image into <dir_name>/<three-digit page number>.jpg
    fn = os.path.join(dir_name, "{:03d}.jpg".format(page))
    with open(fn, "wb") as f:
        browser.download(img_url, outfd=f)
    # Update the widgets: progress text, progress value and image preview
    progress.description = "{}/{}".format(page, total_pages)
    progress.value = page
    # IPython's Image is used here only to read the saved file's bytes
    img.value = Image(filename=fn).data
擋掉更多廣告
In [ ]:
from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest # 控制 browser 的網路連線
from PySide.QtCore import QUrl # Qt 的 Url 類別
In [ ]:
# Close the previous browser
browser.close()
In [ ]:
browser = spynner.Browser(debug_level=spynner.ERROR, debug_stream=sys.stderr)
# Create a webview
browser.create_webview()
settings = browser.webview.settings()

# AutoLoadImages is deliberately NOT switched off this time; a few other
# settings are applied instead. They are not the main point here, but they
# suit this application.
for web_attribute, enabled in (
    (QWebSettings.JavaEnabled, False),            # Java is not needed
    (QWebSettings.DnsPrefetchEnabled, True),      # try to save time spent on DNS
    (QWebSettings.PrivateBrowsingEnabled, True),  # no browsing history needed
):
    settings.setAttribute(web_attribute, enabled)
In [ ]:
# Build a request with an empty URL
BLANK_REQUEST = QNetworkRequest(QUrl())
# Build a request with an empty image URL
# NOTE(review): the URL string passed here is empty, so this request targets
# the same (empty) URL as BLANK_REQUEST -- confirm whether a placeholder image
# URL was intended.
DUMMY_IMG_REQUEST = QNetworkRequest(QUrl(""))
# It is only used once, so a long and clumsy name is acceptable
class EightComicNetworkAccessManager(QNetworkAccessManager):
    """Network access manager that filters the requests made by the browser.

    Requests are sorted into three groups by their URL:

    * URLs without 'comic' in their first 20 characters are replaced by a
      GET for ``BLANK_REQUEST``.
    * URLs ending in 'js' or 'css', or containing '.html', are passed on
      unchanged.
    * Everything else is replaced by a GET for ``DUMMY_IMG_REQUEST``.
    """

    # Overriding createRequest is all that is needed
    def createRequest(self, op, request, device=None):
        """Create the network request, substituting unwanted URLs."""
        # Of all the arguments, the URL alone is enough to decide
        url = str(request.url().toString())

        # Crude way of spotting URLs that do not belong to the 8comic site:
        # swap the original URL for the empty one.
        if 'comic' not in url[:20]:
            return QNetworkAccessManager.createRequest(
                self, self.GetOperation, BLANK_REQUEST)

        # .js, .css and .html resources go through with the original request
        if url.endswith(('js', 'css')) or '.html' in url:
            return QNetworkAccessManager.createRequest(self, op, request, device)

        # Anything else gets the empty image URL in place of the original URL
        return QNetworkAccessManager.createRequest(
            self, self.GetOperation, DUMMY_IMG_REQUEST)
# Install the custom NetworkAccessManager on the browser's web page
browser.webpage.setNetworkAccessManager(EightComicNetworkAccessManager())
In [ ]:
# Make the new browser visible and load the chapter page again
browser.show()
browser.load(base_url+str(book_no))
# Evaluate `ps` in the page and convert the result to an int page count
total_pages = int(browser.runjs('ps'))
In [ ]:
%%timeit -n 1 -r 1
# Image widget used to preview the most recently downloaded page
img = W.Image()
img.width = 300
# Progress bar showing the download progress (one step per page)
progress = W.IntProgress(min=1, value=1, max=total_pages)
display(img)
display(progress)

# Pages are stored under download/<two-digit chapter number>/
dir_name = os.path.join("download", "{:02d}".format(book_no))
# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step that could race with another process.
os.makedirs(dir_name, exist_ok=True)
print("Download to {}".format(os.path.join(os.getcwd(), dir_name)))
# Flush so the message appears before the download loop starts
sys.stdout.flush()

# Start downloading
for page in range(1, total_pages+1):
    # Get the image URL of this page
    browser.load("{}{}-{}".format(base_url, book_no, page))
    img_url = browser.runjs('document.getElementById("TheImg").getAttribute("src")')
    # Download the image into <dir_name>/<three-digit page number>.jpg
    fn = os.path.join(dir_name, "{:03d}.jpg".format(page))
    with open(fn, "wb") as f:
        browser.download(img_url, outfd=f)
    # Update the widgets: progress text, progress value and image preview
    progress.description = "{}/{}".format(page, total_pages)
    progress.value = page
    # IPython's Image is used here only to read the saved file's bytes
    img.value = Image(filename=fn).data
利用 thread
In [ ]:
from urllib.request import urlopen
from multiprocessing.pool import ThreadPool
In [ ]:
%%timeit -n 1 -r 1
import threading  # local import: only needed for the widget lock below

book_no = 63
browser.load(base_url+str(book_no))
total_pages = int(browser.runjs('ps'))

# Serialises the widget updates made by the worker threads; without it the
# read-modify-write in `progress.value += 1` can interleave and lose counts.
widget_lock = threading.Lock()


def save_img(img_url, page):
    """Download one page image and report it on the widgets.

    Runs inside a worker thread of the pool.

    Args:
        img_url: URL of the image to fetch with ``urlopen``.
        page: page number; the file is saved as ``<dir_name>/<page:03d>.jpg``.

    Raises:
        Whatever ``urlopen`` or the file write raises; the exception is kept
        in the task's async result and reported after the pool has finished.
    """
    fn = os.path.join(dir_name, "{:03d}.jpg".format(page))
    with urlopen(img_url) as img_src, open(fn, "wb") as f:
        f.write(img_src.read())
    # Update the widget state
    with widget_lock:
        progress.value += 1
        progress.description = "img: %d/%d"%(progress.value, total_pages)
        img.value = "<img src='{}' height=300 />".format(fn)


# HTML widget used to show a preview of the downloaded image
img = W.HTML()
# Progress bar showing the download progress. It starts at 0 because save_img
# adds 1 per finished image; starting at 1 would show "2/N" after the first
# image and overshoot the maximum on the last one.
progress = W.IntProgress(min=0, value=0, max=total_pages)
display(progress)
display(img)

# Pages are stored under download/<two-digit chapter number>/
dir_name = os.path.join("download", "{:02d}".format(book_no))
# exist_ok=True creates the directory only when it is missing
os.makedirs(dir_name, exist_ok=True)
print("Download to {}".format(os.path.join(os.getcwd(), dir_name)))
sys.stdout.flush()

pool = ThreadPool(5)
# (page, async result) pairs, kept so that failures can be reported afterwards
pending = []
for page in range(1, total_pages+1):
    # Get the image URL in the main thread; the download itself is handed to the pool
    browser.load("{}{}-{}".format(base_url, book_no, page))
    img_url = browser.runjs('document.getElementById("TheImg").getAttribute("src")')
    pending.append((page, pool.apply_async(save_img, (img_url, page))))
pool.close()
pool.join()

# apply_async stores a worker's exception in its result object instead of
# raising it; report every failed page rather than dropping it silently.
for page, result in pending:
    try:
        result.get()
    except Exception as err:  # top-level reporting boundary of the cell
        print("Failed to download page {}: {!r}".format(page, err))